# extract_results.py
# Extracts TD-DFT results from output files and creates Excel/CSV summaries
# Creates publication-ready tables for Harvard Dataverse submission

import pandas as pd
import re
from pathlib import Path

print("="*70)
print("EXTRACTING TD-DFT RESULTS FOR DATA REPOSITORY")
print("="*70)
print()

# Experimental reference
EXP_WAVELENGTH = 468.0  # nm
EXP_ENERGY = 1239.84 / EXP_WAVELENGTH  # eV

# Initialize results dictionary
results = {
    'Functional': [],
    'Functional_Class': [],
    'Excitation_Energy_eV': [],
    'Wavelength_nm': [],
    'Oscillator_Strength': [],
    'Delta_Wavelength_nm': [],
    'Abs_Deviation_nm': [],
    'State_Label': []
}

# Functional classifications
functional_classes = {
    'B3LYP': 'Global hybrid (20% HF)',
    'PBE0': 'Global hybrid (25% HF)',
    'CAM-B3LYP': 'Range-separated hybrid',
    'WB97X-D': 'Range-separated hybrid + dispersion'
}

# Map functional names to output file patterns
functional_files = {
    'B3LYP': 'tddft_b3lyp.out',
    'PBE0': 'tddft_pbe0.out',
    'CAM-B3LYP': 'tddft_camb3lyp.out',
    'WB97X-D': 'tddft_wb97xd.out'
}

# Parse each output file
print("Parsing output files...")
for func_name, filename in functional_files.items():
    filepath = Path(filename)
    
    if not filepath.exists():
        print(f"  ⚠ {filename} not found, skipping...")
        continue
    
    print(f"  Reading {filename}...")
    
    with open(filepath, 'r') as f:
        content = f.read()
    
    # Extract bright state information
    # Look for the line with "←" marker indicating first bright state
    bright_state_pattern = r'S(\d+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+bright\s*←'
    match = re.search(bright_state_pattern, content)
    
    if match:
        state_num = match.group(1)
        energy_ev = float(match.group(2))
        wavelength_nm = float(match.group(3))
        osc_strength = float(match.group(4))
        
        delta_nm = wavelength_nm - EXP_WAVELENGTH
        abs_dev = abs(delta_nm)
        
        results['Functional'].append(func_name)
        results['Functional_Class'].append(functional_classes[func_name])
        results['Excitation_Energy_eV'].append(energy_ev)
        results['Wavelength_nm'].append(wavelength_nm)
        results['Oscillator_Strength'].append(osc_strength)
        results['Delta_Wavelength_nm'].append(delta_nm)
        results['Abs_Deviation_nm'].append(abs_dev)
        results['State_Label'].append(f"S{state_num}")
        
        print(f"    ✓ Extracted S{state_num}: λ = {wavelength_nm:.1f} nm, Δλ = {delta_nm:+.1f} nm")
    else:
        print(f"    ⚠ Could not extract bright state data")

print()

# Create DataFrame
df = pd.DataFrame(results)

# Add experimental reference row
exp_row = {
    'Functional': 'Experiment',
    'Functional_Class': 'Reference',
    'Excitation_Energy_eV': EXP_ENERGY,
    'Wavelength_nm': EXP_WAVELENGTH,
    'Oscillator_Strength': None,
    'Delta_Wavelength_nm': 0.0,
    'Abs_Deviation_nm': 0.0,
    'State_Label': '—'
}
df = pd.concat([df, pd.DataFrame([exp_row])], ignore_index=True)

# Sort by absolute deviation
df_sorted = df.sort_values('Abs_Deviation_nm').reset_index(drop=True)

print("="*70)
print("BENCHMARK SUMMARY")
print("="*70)
print(df_sorted.to_string(index=False))
print()

# Save to Excel with multiple sheets
excel_filename = 'excitation_energies_summary.xlsx'
print(f"Saving results to {excel_filename}...")

with pd.ExcelWriter(excel_filename, engine='openpyxl') as writer:
    # Sheet 1: Main results
    df_sorted.to_excel(writer, sheet_name='Benchmark_Results', index=False)
    
    # Sheet 2: Sorted by wavelength
    df_by_wavelength = df.sort_values('Wavelength_nm').reset_index(drop=True)
    df_by_wavelength.to_excel(writer, sheet_name='By_Wavelength', index=False)
    
    # Sheet 3: Metadata
    metadata = pd.DataFrame({
        'Parameter': [
            'System',
            'Method',
            'Basis Set',
            'Solvation Model',
            'Dielectric Constant',
            'Experimental λ_max',
            'Experimental Source',
            'Calculation Date',
            'Software'
        ],
        'Value': [
            'Camphorquinone (C10H14O2)',
            'TD-DFT (vertical excitations)',
            '6-31G(d)',
            'C-PCM',
            '4.0',
            '468.0 nm',
            'Chen et al., Dent Mater (2007)',
            '2025',
            'PySCF 2.2.0'
        ]
    })
    metadata.to_excel(writer, sheet_name='Metadata', index=False)

print(f"  ✓ {excel_filename} created")
print()

# Save CSV for simple analysis
csv_filename = 'lambda_deviation_analysis.csv'
print(f"Saving CSV to {csv_filename}...")
df_sorted.to_csv(csv_filename, index=False, float_format='%.4f')
print(f"  ✓ {csv_filename} created")
print()

# Generate LaTeX table for manuscript
print("="*70)
print("LATEX TABLE (for manuscript)")
print("="*70)
print()

latex_table = df_sorted[df_sorted['Functional'] != 'Experiment'].to_latex(
    index=False,
    columns=['Functional', 'Excitation_Energy_eV', 'Wavelength_nm', 
             'Oscillator_Strength', 'Delta_Wavelength_nm'],
    header=['Functional', 'E (eV)', 'λ (nm)', 'f', 'Δλ (nm)'],
    float_format='%.3f',
    caption='TD-DFT benchmark results for camphorquinone S₀→S₁ excitation',
    label='tab:tddft_benchmark'
)
print(latex_table)

# Save LaTeX table
with open('table_benchmark_latex.tex', 'w') as f:
    f.write(latex_table)
print("✓ LaTeX table saved to table_benchmark_latex.tex")
print()

# Statistical summary
print("="*70)
print("STATISTICAL SUMMARY")
print("="*70)
df_calc = df[df['Functional'] != 'Experiment']
print(f"Mean absolute deviation: {df_calc['Abs_Deviation_nm'].mean():.2f} nm")
print(f"Std deviation: {df_calc['Abs_Deviation_nm'].std():.2f} nm")
print(f"Best functional: {df_calc.loc[df_calc['Abs_Deviation_nm'].idxmin(), 'Functional']}")
print(f"Worst functional: {df_calc.loc[df_calc['Abs_Deviation_nm'].idxmax(), 'Functional']}")
print()

print("="*70)
print("DATA EXTRACTION COMPLETE")
print("="*70)
print()
print("Files generated for Harvard Dataverse:")
print("  ✓ excitation_energies_summary.xlsx")
print("  ✓ lambda_deviation_analysis.csv")
print("  ✓ table_benchmark_latex.tex")
print()
